{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hmnrwuHZHxss",
        "outputId": "3896a25b-0eeb-4cd5-9d26-6eba6954fe53"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/opt/conda/envs/rllm/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Dataset({\n",
            "    features: ['contestId', 'name', 'tags', 'title', 'time-limit', 'memory-limit', 'problem-description', 'input-specification', 'output-specification', 'demo-input', 'demo-output', 'note', 'test_cases', 'timeConsumedMillis', 'memoryConsumedBytes', 'score', '__index_level_0__'],\n",
            "    num_rows: 7323\n",
            "})\n"
          ]
        }
      ],
      "source": [
        "import json\n",
        "import pandas as pd\n",
        "import os\n",
        "from datasets import load_dataset\n",
        "\n",
        "# Load the \"train\" split of the Codeforces problem set via datasets.load_dataset\n",
        "# and print its summary (feature names and row count).\n",
        "ds = load_dataset(\"DenCT/codeforces-problems-7k\", split=\"train\")\n",
        "print(ds)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Prompt layout that includes the problem's time and memory limits.\n",
        "# Filled in with str.format() when the CodeElo test set is built further down.\n",
        "# Placeholders: description, input_format, output_format, tags, time_limit, memory_limit.\n",
        "# The text begins on the line after the opening quotes, so the rendered prompt starts with a newline.\n",
        "CODEFORCES_TEMPLATE = \"\"\"\n",
        "{description}\n",
        "\n",
        "Input Format:\n",
        "{input_format}\n",
        "\n",
        "Output Format:\n",
        "{output_format}\n",
        "\n",
        "Tags:\n",
        "{tags}\n",
        "\n",
        "Time Limit: {time_limit} ms\n",
        "Memory Limit: {memory_limit} MB\n",
        "\"\"\"\n",
        "\n",
        "# Prompt layout without time/memory limits; it shows the demo input and output instead.\n",
        "# Filled in with str.format() when the training set is built further down.\n",
        "# Placeholders: description, input_format, output_format, tags, demo_input, demo_output.\n",
        "# The text begins on the line after the opening quotes, so the rendered prompt starts with a newline.\n",
        "CODEFORCES_TEMPLATE_NO_LIMIT = \"\"\"\n",
        "{description}\n",
        "\n",
        "Input Format:\n",
        "{input_format}\n",
        "\n",
        "Output Format:\n",
        "{output_format}\n",
        "\n",
        "Tags:\n",
        "{tags}\n",
        "\n",
        "Demo input: {demo_input}\n",
        "Demo output: {demo_output}\n",
        "\"\"\"\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "311OF4w_K4EJ"
      },
      "outputs": [],
      "source": [
        "# func. to extract difficulty rating from tags\n",
        "def extract_difficulty(tags):\n",
        "    if pd.isnull(tags):  # Handles when tags are null\n",
        "        return 0\n",
        "    for tag in tags.split(\",\"):\n",
        "        if \"*\" in tag:  # Difficulty is marked with a '*' symbol\n",
        "            try:\n",
        "                return int(tag.replace(\"*\", \"\"))\n",
        "            except ValueError:\n",
        "                continue\n",
        "    return 0  # Default difficulty "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "id": "U2gCVF4ULG9l"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Train dataset size: 6128\n"
          ]
        }
      ],
      "source": [
        "# Turn every Codeforces problem into a prompt plus its test cases.\n",
        "# Problems with one test case or fewer are left out.\n",
        "dataset = [\n",
        "    {\n",
        "        \"problem\": CODEFORCES_TEMPLATE_NO_LIMIT.format(\n",
        "            description=row[\"problem-description\"],\n",
        "            input_format=row[\"input-specification\"],\n",
        "            output_format=row[\"output-specification\"],\n",
        "            tags=row[\"tags\"],\n",
        "            demo_input=row[\"demo-input\"],\n",
        "            demo_output=row[\"demo-output\"],\n",
        "        ),\n",
        "        \"tests\": row[\"test_cases\"],\n",
        "    }\n",
        "    for row in ds\n",
        "    if len(row[\"test_cases\"]) > 1\n",
        "]\n",
        "\n",
        "print(f\"Train dataset size: {len(dataset)}\")\n",
        "\n",
        "# Write the training set to ../../train/code/codeforces.json (relative to the working directory).\n",
        "output_dir = os.path.abspath(\"../../train/code\")\n",
        "# open() does not create missing parent directories, so make sure the target exists.\n",
        "os.makedirs(output_dir, exist_ok=True)\n",
        "output_file = os.path.join(output_dir, \"codeforces.json\")\n",
        "\n",
        "with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
        "    json.dump(dataset, f, indent=4)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Dataset({\n",
            "    features: ['problem_id', 'url', 'title', 'rating', 'tags', 'div', 'time_limit_ms', 'memory_limit_mb', 'description', 'input', 'output', 'interaction', 'examples', 'note'],\n",
            "    num_rows: 408\n",
            "})\n"
          ]
        }
      ],
      "source": [
        "# Load the \"test\" split of Qwen/CodeElo and print its summary.\n",
        "# NOTE: this rebinds `ds`, so the training dataset loaded earlier is no longer\n",
        "# reachable under that name once this cell has run.\n",
        "ds = load_dataset(\"Qwen/CodeElo\", split=\"test\")\n",
        "print(ds)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Test dataset size: 408\n"
          ]
        }
      ],
      "source": [
        "def process_test_cases(raw_cases):\n",
        "    # Examples field: https://huggingface.co/datasets/Qwen/CodeElo\n",
        "    # The first element is the full input string, the second is the full output string\n",
        "    formatted_cases = []\n",
        "    for rc in raw_cases:\n",
        "        input_case = rc[0]  # Keep input as a single string\n",
        "        output_case = rc[1]  # Keep output as a single string\n",
        "        # Structure the test cases\n",
        "        formatted_cases.append({\"input\": input_case, \"output\": output_case})\n",
        "    return formatted_cases\n",
        "\n",
        "# Turn every CodeElo problem into a prompt plus its example test cases.\n",
        "# Problems without any example are left out.\n",
        "dataset = []\n",
        "for problem in ds:\n",
        "    cases = process_test_cases(problem[\"examples\"])\n",
        "    if cases:\n",
        "        prompt = CODEFORCES_TEMPLATE.format(\n",
        "            description=problem[\"description\"],\n",
        "            input_format=problem[\"input\"],\n",
        "            output_format=problem[\"output\"],\n",
        "            tags=problem[\"tags\"],\n",
        "            time_limit=problem[\"time_limit_ms\"],\n",
        "            memory_limit=problem[\"memory_limit_mb\"],\n",
        "        )\n",
        "        dataset.append({\"problem\": prompt, \"tests\": cases})\n",
        "\n",
        "print(f\"Test dataset size: {len(dataset)}\")\n",
        "\n",
        "# Write the test set to ../../test/code/codeforces.json (relative to the working directory).\n",
        "output_dir = os.path.abspath(\"../../test/code\")\n",
        "# open() does not create missing parent directories, so make sure the target exists.\n",
        "os.makedirs(output_dir, exist_ok=True)\n",
        "output_file = os.path.join(output_dir, \"codeforces.json\")\n",
        "\n",
        "with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
        "    json.dump(dataset, f, indent=4)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "rllm",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.16"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
